Data Wrangling

Author

Agam Safaruddin

Note

Most of my notes of studying this chapter is done in ‘r-data-wrangling.ipynb’ (see workspace content). Just in case it is necessary to show it.

1 Requirements

1.1 Libraries

Just simply get every library

library(readxl)
library(lubridate)
#> 
#> Attache Paket: 'lubridate'
#> Die folgenden Objekte sind maskiert von 'package:base':
#> 
#>     date, intersect, setdiff, union
library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
#> ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
#> ✔ tibble  3.1.6     ✔ dplyr   1.0.8
#> ✔ tidyr   1.2.0     ✔ stringr 1.4.0
#> ✔ readr   2.1.2     ✔ forcats 0.5.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ lubridate::as.difftime() masks base::as.difftime()
#> ✖ lubridate::date()        masks base::date()
#> ✖ dplyr::filter()          masks stats::filter()
#> ✖ lubridate::intersect()   masks base::intersect()
#> ✖ dplyr::lag()             masks stats::lag()
#> ✖ lubridate::setdiff()     masks base::setdiff()
#> ✖ lubridate::union()       masks base::union()
library(readr)
library(ggplot2)
library(knitr)
library(RSQLite)
#> Warning: Paket 'RSQLite' wurde unter R Version 4.2.3 erstellt
library(rvest)     # HTML Hacking & Web Scraping
#> 
#> Attache Paket: 'rvest'
#> Das folgende Objekt ist maskiert 'package:readr':
#> 
#>     guess_encoding
library(xopen)     # Quickly opening URLs
#> Warning: Paket 'xopen' wurde unter R Version 4.2.3 erstellt
library(jsonlite)  # converts JSON files to R objects
#> 
#> Attache Paket: 'jsonlite'
#> Das folgende Objekt ist maskiert 'package:purrr':
#> 
#>     flatten
library(glue)      # concatenate strings
library(stringi)   # character string/text processing
library(httr)
library(stringr)
library(data.table)
#> 
#> Attache Paket: 'data.table'
#> Die folgenden Objekte sind maskiert von 'package:dplyr':
#> 
#>     between, first, last
#> Das folgende Objekt ist maskiert 'package:purrr':
#> 
#>     transpose
#> Die folgenden Objekte sind maskiert von 'package:lubridate':
#> 
#>     hour, isoweek, mday, minute, month, quarter, second, wday, week,
#>     yday, year
library(vroom)

2 Challenge 1

  • using the reduced_patent_dataset
  • Patents_DB_dictionary_… .xlsx is the information about the data
  • What US company or corporation has the most patents? List 10 US companies with the most assigned/granted patents

2.1 Data import

import from patent.tsv

  • Creating a table, and then importing from .tsv into the table.
  • \ t is for tab-delimited files (.tsv). Seperating columns by their tabbed white spaces. (Import to excel to clearly see it)
  • skip = 1 the first row, becuase they are just titles
  • common column values’s name are made the same, to match the data variables (eg: id)
col_types <- list(
  id = col_character(),
    date = col_date("%Y-%m-%d"),
    num_claims = col_double(),
  type = col_character(),
  number = col_character(),
  country = col_character(),
  
  abstract = col_character(),
  title = col_character(),
  kind = col_character(),
  
  filename = col_character(),
  withdrawn = col_double()
)

patent_tbl <- vroom(
            file       = "Patent_data_reduced/Patent_data_reduced/patent.tsv", 
            skip = 1,
            delim      = "\t", 
            col_names  = names(col_types),
            col_types  = col_types,
            na         = c("", "NA", "NULL")
        )

import from patent_assignee.tsv

col_types_patent_assignee <- list(
  id = col_character(),
  assignee = col_character()
)

patent_assignee_tbl <- vroom(
            file       = "Patent_data_reduced/Patent_data_reduced/patent_assignee.tsv", 
            skip = 1,
            delim      = "\t", 
            col_names  = names(col_types_patent_assignee),
            col_types  = col_types_patent_assignee,
            na         = c("", "NA", "NULL")
        )

import from assignee.tsv

col_types_assignee <- list(
  assignee = col_character(),
    type = col_character(),
    organization = col_character()
)

assignee_tbl <- vroom(
            file       = "Patent_data_reduced/Patent_data_reduced/assignee.tsv", 
            skip = 1,
            delim      = "\t", 
            col_names  = names(col_types_assignee),
            col_types  = col_types_assignee,
            na         = c("", "NA", "NULL")
        )

import from uspc.tsv

col_types_uspc <- list(
  id = col_character(),
    mainclass_id = col_character(),
    sequence = col_double()
)

uspc_tbl <- vroom(
            file       = "Patent_data_reduced/Patent_data_reduced/uspc.tsv", 
            skip = 1,
            delim      = "\t", 
            col_names  = names(col_types_uspc),
            col_types  = col_types_uspc,
            na         = c("", "NA", "NULL")
        )

2.2 See results

patent_tbl
patent_assignee_tbl
assignee_tbl
uspc_tbl

2.3 Convert to data.table

# patent Data ----
setDT(patent_tbl)

class(patent_tbl)
#> [1] "data.table" "data.frame"
patent_tbl %>% glimpse()
#> Rows: 327,014
#> Columns: 3
#> $ id         <chr> "8621662", "8621663", "8621664", "8621665", "8621666", "862…
#> $ date       <date> 2014-01-07, 2014-01-07, 2014-01-07, 2014-01-07, 2014-01-07…
#> $ num_claims <dbl> 11, 6, 10, 18, 7, 9, 21, 19, 8, 18, 7, 15, 15, 15, 18, 7, 1…
# patent-assignee Data ----
setDT(patent_assignee_tbl)

patent_assignee_tbl %>% glimpse()
#> Rows: 315,910
#> Columns: 2
#> $ id       <chr> "8709412", "8636251", "8899346", "8700141", "8724986", "D6998…
#> $ assignee <chr> "org_MPhnVOTFsXybN0auC647", "org_v7VisXnmZZEEUMeHhW6y", "org_…
# assignee Data ----
setDT(assignee_tbl)
assignee_tbl %>% glimpse()
#> Rows: 47,011
#> Columns: 3
#> $ assignee     <chr> "org_004j997jM9yEdS7z4ReD", "org_005hVGA5JMOZsS0xOhGa", "…
#> $ type         <chr> "3", "3", "2", "2", "3", "3", "2", "3", "2", "3", "2", "2…
#> $ organization <chr> "University of Basel", "Zetkama Spólka Akcyjna", "Mirabil…
# uspc Data
setDT(uspc_tbl)
uspc_tbl %>% glimpse()
#> Rows: 815,743
#> Columns: 3
#> $ id           <chr> "8829273", "8623780", "8904894", "8794165", "8773920", "8…
#> $ mainclass_id <chr> "435", "502", "368", "111", "365", "128", "709", "707", "…
#> $ sequence     <dbl> 7, 2, 0, 1, 0, 4, 2, 1, 2, 2, 10, 1, 2, 4, 1, 1, 9, 0, 2,…

2.4 Data wrangling

  • combine by id
  • all.x = TRUE: all rows included
  • all.y = FALSE: only mathcing rows will be combined to x
  • uspc has the most id rows
  • patent has the 2nd most id rows
  • patent-assignee 3rd
  • assignee 4th least rows
combined_data_patent_1 <- merge(x = uspc_tbl, y = patent_tbl, 
                       by    = "id", 
                       all.x = TRUE, 
                       all.y = FALSE)


combined_data_patent_1 %>% glimpse()
#> Rows: 815,743
#> Columns: 5
#> $ id           <chr> "8621662", "8621663", "8621663", "8621664", "8621664", "8…
#> $ mainclass_id <chr> "2", "2", "351", "2", "2", "2", "2", "2", "2", "2", "2", …
#> $ sequence     <dbl> 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 3, 2, 4, 0, 2, 1, 3, 0, …
#> $ date         <date> 2014-01-07, 2014-01-07, 2014-01-07, 2014-01-07, 2014-01-…
#> $ num_claims   <dbl> 11, 6, 6, 10, 10, 18, 7, 7, 9, 21, 21, 21, 21, 21, 19, 19…
combined_data_patent_2 <- merge(x = combined_data_patent_1, y = patent_assignee_tbl, 
                       by    = "id", 
                       all.x = TRUE, 
                       all.y = FALSE)


combined_data_patent_2 %>% glimpse()
#> Rows: 848,347
#> Columns: 6
#> $ id           <chr> "8621662", "8621663", "8621663", "8621664", "8621664", "8…
#> $ mainclass_id <chr> "2", "2", "351", "2", "2", "2", "2", "2", "2", "2", "2", …
#> $ sequence     <dbl> 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 3, 2, 4, 0, 2, 1, 3, 0, …
#> $ date         <date> 2014-01-07, 2014-01-07, 2014-01-07, 2014-01-07, 2014-01-…
#> $ num_claims   <dbl> 11, 6, 6, 10, 10, 18, 7, 7, 9, 21, 21, 21, 21, 21, 19, 19…
#> $ assignee     <chr> "org_aTMUEAbUvQuADfnSfudQ", "org_FfZ2sonhh4RvKY8vYp2B", "…
combined_data_patent_3 <- merge(x = combined_data_patent_2, y = assignee_tbl, 
                       by    = "assignee", 
                       all.x = TRUE, 
                       all.y = FALSE)


combined_data_patent_3 %>% glimpse()
#> Rows: 848,347
#> Columns: 8
#> $ assignee     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ id           <chr> "8621664", "8621664", "8621665", "8621667", "8621672", "8…
#> $ mainclass_id <chr> "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "15", "…
#> $ sequence     <dbl> 0, 1, 0, 0, 0, 4, 1, 3, 2, 0, 0, 2, 1, 0, 1, 0, 1, 0, 2, …
#> $ date         <date> 2014-01-07, 2014-01-07, 2014-01-07, 2014-01-07, 2014-01-…
#> $ num_claims   <dbl> 10, 10, 18, 9, 7, 15, 15, 15, 15, 15, 1, 19, 19, 19, 18, …
#> $ type         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ organization <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

2.5 Data filtering

  • Picking only the US companies using grepl()

  • “Inc.” (Incorporated), “Corp.” (Corporation), “LLC” (Limited Liability Company), or “Co.” (Company).

  • ignoring “Co., Ltd.”, because those are typical eastern asian companies

  • OR Use type = 2, it means US companies

#US_combined_data_patent <- combined_data_patent_3[grepl("Inc\\.|Corp\\.|LLC|Co\\.", organization) & !grepl("Co\\., Ltd\\.", organization)] 
US_combined_data_patent <- combined_data_patent_3[grepl(2, type)]

2.6 Calculation

  • Count how many times a name appeared, to see how many patents they each have. And reorder row.
number_of_patents_owned <- US_combined_data_patent[, .N, by = organization]
setnames(number_of_patents_owned, "N", "number_of_patents_owned")

number_of_patents_owned
#setorderv(number_of_patents_owned, c("number_of_patents_owned", "organization"))
#setorderv(number_of_patents_owned, -(number_of_patents_owned))
US_combined_data_patent_descending = number_of_patents_owned %>% arrange(desc(number_of_patents_owned))

2.7 Plot readable table

kable(head(US_combined_data_patent_descending, n = 10))
organization number_of_patents_owned
International Business Machines Corporation 19056
QUALCOMM Incorporated 7322
Microsoft Corporation 7195
Google Inc. 6082
Apple Inc. 5542
AT&T INTELLECTUAL PROPERTY I, L.P. 5010
General Electric Company 4646
Intel Corporation 4252
GM Global Technology Operations LLC 3920
Broadcom Corporation 3736

3 Challenge 2

  • US companies with most patents gained in August 2014

3.1 Picking out only Aug 2014

#month = month(date)
US_combined_data_patent_Aug_2014 = US_combined_data_patent[format(date, "%Y-%m") %like% "2014-08"]
US_combined_data_patent_Aug_2014

3.2 Calculation

  • Count how many times a name appeared, to see how many patents they each have. And reorder row
number_of_patents_owned_aug_2014 <- US_combined_data_patent_Aug_2014[, .N, by = organization]
setnames(number_of_patents_owned_aug_2014, "N", "number_of_patents_owned_aug_2014")

number_of_patents_owned_aug_2014
#setorderv(number_of_patents_owned, c("number_of_patents_owned", "organization"))
#setorderv(number_of_patents_owned, -(number_of_patents_owned))
US_combined_data_patent_descending_aug_2014 = number_of_patents_owned_aug_2014 %>% arrange(desc(number_of_patents_owned_aug_2014))

3.3 Plot readable table

kable(head(US_combined_data_patent_descending_aug_2014, n = 10))
organization number_of_patents_owned_aug_2014
International Business Machines Corporation 1810
Microsoft Corporation 779
QUALCOMM Incorporated 591
Google Inc. 566
Apple Inc. 501
Intel Corporation 404
AT&T INTELLECTUAL PROPERTY I, L.P. 401
Broadcom Corporation 353
General Electric Company 345
Hewlett-Packard Development Company, L.P. 341

4 Challenge 3

  • What is the most innovative tech sector (mainclass_id > 300) https://www.uspto.gov/sites/default/files/documents/caau.pdf, but mainclass_id is type character
  • Top 10 companies worldwide
  • Top 5 USPTO tech main classes

4.1 Top 10 companies worldwide

Get data with above 300 mainclass_id

#combined_data_patent_tech <- combined_data_patent_3[grepl(>300, mainclass_id)]

#above_300 = combined_data_patent_3$mainclass_id > 300

#mainclass_id_num = as.numeric(combined_data_patent_3$mainclass_id)
above_300 = 300

combined_data_patent_tech = combined_data_patent_3[grepl("^\\d+$", mainclass_id) & as.numeric(mainclass_id) > above_300]
#> Warning in eval(.massagei(isub), x, ienv): NAs durch Umwandlung erzeugt
combined_data_patent_tech

Calculation

  • Count how many times a name appeared, to see how many patents they each have. And reorder row
number_of_patents_owned_tech_worldwide <- combined_data_patent_tech[, .N, by = organization]
setnames(number_of_patents_owned_tech_worldwide, "N", "number_of_patents_owned_tech_worldwide")

number_of_patents_owned_tech_worldwide = number_of_patents_owned_tech_worldwide[organization != "NA"]
number_of_patents_owned_tech_worldwide
#setorderv(number_of_patents_owned, c("number_of_patents_owned", "organization"))
#setorderv(number_of_patents_owned, -(number_of_patents_owned))
number_of_patents_owned_tech_worldwide_descending = number_of_patents_owned_tech_worldwide %>% arrange(desc(number_of_patents_owned_tech_worldwide))

Plot readable table

kable(head(number_of_patents_owned_tech_worldwide_descending, n = 10))
organization number_of_patents_owned_tech_worldwide
International Business Machines Corporation 15948
Samsung Electronics Co., Ltd. 11469
Canon Kabushiki Kaisha 8291
Sony Corporation 7403
QUALCOMM Incorporated 7098
Microsoft Corporation 7077
Google Inc. 5908
Apple Inc. 5125
AT&T INTELLECTUAL PROPERTY I, L.P. 4985
LG Electronics Inc. 4854

4.2 Top 5 USPTO tech main classes

US_combined_data_patent_tech = combined_data_patent_tech[grepl(2, type)]
US_combined_data_patent_tech

Organizing

Top_USPTO = US_combined_data_patent_tech %>% arrange(desc(sequence))
Top_USPTO_1 = Top_USPTO[, organization := as.character(organization)]
Top_USPTO_2 = Top_USPTO[, .(sequence = max(sequence)), by = organization]
Top_USPTO_2

Plot readable table

kable(head(Top_USPTO_2, n = 5))
organization sequence
AT&T INTELLECTUAL PROPERTY I, L.P. 77
Apple Inc. 62
Alverix, Inc. 61
SONY NETWORK ENTERTAINMENT INTERNATIONAL LLC 60
Sony Electronics Inc. 58